In [1]:
#!/usr/bin/env python
"""xgboost shap and lime"""
Out[1]:
'XGBoost SHAP and LIME'
In [2]:
# parameters
MODEL = "onTravelV6C"
N_SAMPLES = 500
TRAIN_DATA_FILE = "train_" + MODEL + ".txt"
FEATURE_MAP_FILE = "feature_map_" + MODEL + ".json"
MODEL_FILE = MODEL + ".bin"
SAMPLE_FILE = "sample_" + str(N_SAMPLES) + "_" + TRAIN_DATA_FILE
In [3]:
%%bash
# prepare

# parameters
MODEL="onTravelV6C"
N_SAMPLES=500
TRAIN_DATA_FILE="train_${MODEL}.txt"
FEATURE_MAP_FILE="feature_map_${MODEL}.json"
MODEL_FILE="${MODEL}.bin"
SAMPLE_FILE="sample_${N_SAMPLES}_${TRAIN_DATA_FILE}"

# train data file
if [[ ! -f ${TRAIN_DATA_FILE} ]]; then
    echo "Train Data File Not Exist"
    echo "Copy File Begin"
    cp /mfw_data/algo/wanglei/spark_offline/train_data/onTravel/${TRAIN_DATA_FILE} ./
    echo "Copy File End"
fi

# feature map data file
if [[ ! -f ${FEATURE_MAP_FILE} ]]; then
    echo "Feature Map File Not Exist"
    echo "Get File Begin"
    hadoop fs -text /user/wanglei3/featureMap/onTravel/${MODEL}/part-00000.snappy > ${FEATURE_MAP_FILE}
    echo "Get File End"
fi

# xgboost model file
if [[ ! -f ${MODEL_FILE} ]]; then
    echo "Model File Not Exist"
    echo "Copy File Begin"
    cp /opt/tomcat/webapps/model/${MODEL} ./
    mv ${MODEL} ${MODEL_FILE}
    echo "Copy File End"
fi

# random sampling
if [[ ! -f ${SAMPLE_FILE} ]]; then
    echo "Sample File Not Exist"
    echo "Sampling Begin"
    shuf -n ${N_SAMPLES} ${TRAIN_DATA_FILE} -o ${SAMPLE_FILE}
    echo "Sampling End"
fi

ls
Train Data File Does Not Exist
Copy File Begin
Copy File End
Feature Map File Does Not Exist
Get File Begin
Get File End
Model File Does Not Exist
Copy File Begin
Copy File End
Sample File Does Not Exist
Sampling Begin
Sampling End
feature_map_onTravelV6C.json
onTravelV6C.bin
sample_500_train_onTravelV6C.txt
train_onTravelV6C.txt
xgboost-shap-and-lime.ipynb
In [4]:
# IPython option: display every expression result in a cell, not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [5]:
# package
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn")
import shap
import lime
import lime.lime_tabular
import json
import re
In [6]:
# feature map: skip the first entry, then collect the feature names from the
# remaining "index<TAB>name<TAB>type" lines
with open(FEATURE_MAP_FILE) as fp:
    feature_map = json.load(fp)
cols = []
for fm in feature_map[1:]:
    print(fm)
    cols.append(re.search(r"\t(.*)\t", fm).group(1))
1	lat	q
2	lng	q
3	doubleFlow_user_view_3	q
4	doubleFlow_weng_user_view_3	q
5	doubleFlow_travel_user_view_3	q
6	doubleFlow_user_click_3	q
7	doubleFlow_poi_norm_click_3	q
8	doubleFlow_weng_user_click_3	q
9	doubleFlow_travel_user_click_3	q
10	doubleFlow_user_view_1	q
11	doubleFlow_weng_user_view_1	q
12	doubleFlow_travel_user_view_1	q
13	doubleFlow_user_click_1	q
14	doubleFlow_poi_norm_click_1	q
15	doubleFlow_weng_user_click_1	q
16	doubleFlow_travel_user_click_1	q
17	doubleFlow_user_view_7	q
18	doubleFlow_weng_user_view_7	q
19	doubleFlow_travel_user_view_7	q
20	doubleFlow_user_click_7	q
21	doubleFlow_poi_norm_click_7	q
22	doubleFlow_weng_user_click_7	q
23	doubleFlow_travel_user_click_7	q
24	doubleFlow_user_view_30	q
25	doubleFlow_weng_user_view_30	q
26	doubleFlow_travel_user_view_30	q
27	doubleFlow_user_click_30	q
28	doubleFlow_poi_norm_click_30	q
29	doubleFlow_weng_user_click_30	q
30	doubleFlow_travel_user_click_30	q
31	click_emb_90_norm	q
32	click_emb_30_norm	q
33	click_emb_7_norm	q
34	click_emb_1_norm	q
35	doubleFlow_mdd_avg_click_30_norm	q
36	doubleFlow_mdd_avg_click_7_norm	q
37	doubleFlow_mdd_avg_click_3_norm	q
38	doubleFlow_mdd_avg_click_1_norm	q
39	doubleFlow_mdd_loc_avg_click_30_norm	q
40	doubleFlow_mdd_loc_avg_click_7_norm	q
41	doubleFlow_mdd_loc_avg_click_3_norm	q
42	doubleFlow_mdd_loc_avg_click_1_norm	q
43	music	q
44	mdd_hot	q
45	doubleFlow_article_view_1	q
46	doubleFlow_article_click_1	q
47	doubleFlow_article_view_3	q
48	doubleFlow_article_click_3	q
49	doubleFlow_article_view_7	q
50	doubleFlow_article_click_7	q
51	doubleFlow_article_view_30	q
52	doubleFlow_article_click_30	q
53	item_norm	q
54	mdd_vector_norm	q
55	index_intention_mdd_cnt_7	q
56	index_intention_mdd_cnt_30	q
57	ui_cosine_90	q
58	ui_cosine_30	q
59	ui_cosine_70	q
60	mdd_avg_30_cosine	q
61	mdd_avg_7_cosine	q
62	mdd_avg_3_cosine	q
63	mdd_avg_1_cosine	q
64	mdd_avg_30_loc_cosine	q
65	mdd_avg_7_loc_cosine	q
66	mdd_avg_3_loc_cosine	q
67	mdd_avg_1_loc_cosine	q
68	doubleFlow_article_ctr_1_v1	q
69	doubleFlow_article_ctr_3_v1	q
70	doubleFlow_article_ctr_7_v1	q
71	doubleFlow_article_ctr_30_v1	q
72	event_week=0	i
73	event_week=1	i
74	event_week=2	i
75	event_week=3	i
76	event_week=4	i
77	event_week=5	i
78	event_week=6	i
79	item_type=0	i
80	item_type=1	i
81	item_type=6	i
82	event_hour=0	i
83	event_hour=1	i
84	event_hour=2	i
85	event_hour=3	i
86	event_hour=4	i
87	event_hour=5	i
88	event_hour=6	i
89	event_hour=7	i
90	event_hour=8	i
91	event_hour=9	i
92	event_hour=10	i
93	event_hour=11	i
94	event_hour=12	i
95	event_hour=13	i
96	event_hour=14	i
97	event_hour=15	i
98	event_hour=16	i
99	event_hour=17	i
100	event_hour=18	i
101	event_hour=19	i
102	event_hour=20	i
103	event_hour=21	i
104	event_hour=22	i
105	event_hour=23	i
106	event_month=1	i
107	event_month=2	i
108	event_month=3	i
109	event_month=4	i
110	event_month=5	i
111	event_month=6	i
112	event_month=7	i
113	event_month=8	i
114	event_month=9	i
115	event_month=10	i
116	event_month=11	i
117	event_month=12	i
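The regex above assumes every entry contains at least two tabs; a more defensive, split-based variant (a sketch added here, not part of the original run) reproduces the same column list:

# defensive variant (sketch): split on tabs rather than a regex and check
# that it yields the same feature names as the loop above
cols_alt = [fm.split("\t")[1] for fm in feature_map[1:] if len(fm.split("\t")) >= 3]
assert cols_alt == cols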
In [7]:
# load libsvm format file
X, y = load_svmlight_file(SAMPLE_FILE, n_features=len(cols))
print(X[0].todense().shape)
print(y[0])
(1, 117)
0.0
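For reference, each libsvm line has the form `label index:value index:value ...`, with one-based feature indices in this file. The peek below is a sketch added for illustration (it assumes the first line contains at least one feature):

# peek at the first raw libsvm line (sketch)
with open(SAMPLE_FILE) as fp:
    first_line = fp.readline().strip()
print(first_line[:120])
tokens = first_line.split()[1:]
print("min feature index:", min(int(t.split(":")[0]) for t in tokens))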
In [8]:
# create dataframe; prepend a zero-valued "repair" placeholder column so the
# frame matches the model's 118 features (feature index 0 is unused in the file)
df = pd.DataFrame(X.todense())
df.columns = cols
df["repair"] = np.zeros(len(df))
df["label"] = y
df = df[["repair"] + cols + ["label"]]
df.head()
Out[8]:
repair lat lng doubleFlow_user_view_3 doubleFlow_weng_user_view_3 doubleFlow_travel_user_view_3 doubleFlow_user_click_3 doubleFlow_poi_norm_click_3 doubleFlow_weng_user_click_3 doubleFlow_travel_user_click_3 ... event_month=4 event_month=5 event_month=6 event_month=7 event_month=8 event_month=9 event_month=10 event_month=11 event_month=12 label
0 0.0 26.872812 112.495817 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 34.644576 112.407969 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 47.247368 127.113634 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 31.023395 121.411307 93.0 66.0 7.0 8.0 2.44949 5.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 119 columns

In [9]:
IS_TRAIN = False
In [10]:
# train xgboost model
if IS_TRAIN:
#     from sklearn.ensemble import GradientBoostingClassifier
#     param = {
#         "loss": "deviance",
#         "learning_rate": 0.1,
#         "max_depth": 7,
#         "subsample": 0.8,
#         "n_estimators": 300
#     }
#     sk_gbt = GradientBoostingClassifier(**param)
#     sk_gbt.fit(df[["repair"]+cols], df["label"])

#     param = {
#         "objective": "binary:logistic",
#         "learning_rate": 0.1,
#         "max_depth": 7,
#         "min_child_weight": 1,
#         "gamma": 0,
#         "subsample": 0.8,
#         "colsample_bytree": 0.8,
#         "scale_pos_weight": 1,
#         "n_estimators": 300,
#     }
#     sk_xgb = xgb.XGBClassifier(**param)
#     sk_xgb.fit(df[["repair"]+cols], df["label"])

    param = {
        "objective": "binary:logistic",
        "eta": 0.1,
        "max_depth": 7,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "scale_pos_weight": 1,
        "silent": True
    }
    num_boost_round = 300
    dtrain = xgb.DMatrix(df[["repair"]+cols], label=df["label"])
    bst_xgb = xgb.train(param, dtrain, num_boost_round=num_boost_round)
else:
    bst = xgb.Booster(model_file=MODEL_FILE)
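If the training branch is ever enabled, the fresh booster can be persisted in the same binary format the `else` branch loads; a minimal sketch:

if IS_TRAIN:
    # save the freshly trained booster so later runs can take the load path
    bst_xgb.save_model(MODEL_FILE)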
In [11]:
if IS_TRAIN:
    model = bst_xgb
else:
    model = bst
In [16]:
# margin or probability
MODEL_OUTPUT = "probability"
In [17]:
# shap
if MODEL_OUTPUT == "margin":
    # margin explanation
    shap_explainer = shap.TreeExplainer(model)
elif MODEL_OUTPUT == "probability":
    # probability explanation needs a background dataset
    BACKGROUND_DATASET_SIZE = 1000
    if len(df[["repair"]+cols]) <= BACKGROUND_DATASET_SIZE:
        background_dataset = df[["repair"]+cols]
    else:
        background_dataset = df[["repair"]+cols].sample(BACKGROUND_DATASET_SIZE)
    shap_explainer = shap.TreeExplainer(model, background_dataset.values,
                                        model_output="probability",
                                        feature_dependence="independent")
In [20]:
shap_values = shap_explainer.shap_values(df[["repair"]+cols])
print("shap_values: ", shap_values.shape)
y_base = shap_explainer.expected_value
print("y_base: ", y_base)
100%|===================| 498/500 [00:39<00:00]        
shap_values:  (500, 118)
y_base:  0.24321979340685262
In [22]:
if MODEL_OUTPUT == "margin":
    # margin explanation
    df["pred"] = model.predict(xgb.DMatrix(df[["repair"]+cols], label=df["label"]), output_margin=True)
if MODEL_OUTPUT == "probability":
    # probability explanation
    df["pred"] = model.predict(xgb.DMatrix(df[["repair"]+cols], label=df["label"]), output_margin=False)
print("pred mean: ", df["pred"].mean())
df.head()
pred mean:  0.2415926605463028
Out[22]:
repair lat lng doubleFlow_user_view_3 doubleFlow_weng_user_view_3 doubleFlow_travel_user_view_3 doubleFlow_user_click_3 doubleFlow_poi_norm_click_3 doubleFlow_weng_user_click_3 doubleFlow_travel_user_click_3 ... event_month=5 event_month=6 event_month=7 event_month=8 event_month=9 event_month=10 event_month=11 event_month=12 label pred
0 0.0 26.872812 112.495817 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.030440
1 0.0 34.644576 112.407969 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.364603
2 0.0 47.247368 127.113634 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.290305
3 0.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.513296
4 0.0 31.023395 121.411307 93.0 66.0 7.0 8.0 2.44949 5.0 1.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.212322

5 rows × 120 columns
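Since the explainer was built with model_output="probability", the base value plus a row's SHAP values should roughly reconstruct that row's predicted probability; a quick additivity check (a sketch, not from the original run):

# additivity check (sketch): reconstruction is approximate because the
# interventional explainer averages over a sampled background dataset
recon = y_base + shap_values.sum(axis=1)
print("max |recon - pred|:", np.abs(recon - df["pred"].values).max())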

In [23]:
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values, df[["repair"]+cols])
Out[23]:
(interactive SHAP force plot; JavaScript output not captured in this export)
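When the JavaScript widget cannot render (nbviewer, GitHub, or a stripped export), the same plot can be written to a standalone HTML file with shap.save_html; a sketch with an arbitrary output path:

# export the interactive force plot as self-contained HTML (sketch)
plot_all = shap.force_plot(shap_explainer.expected_value, shap_values, df[["repair"]+cols])
shap.save_html("force_plot_all.html", plot_all)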
In [24]:
shap.summary_plot(shap_values, df[["repair"]+cols], plot_type="bar")
In [25]:
shap.summary_plot(shap_values, df[["repair"]+cols])
In [26]:
if MODEL_OUTPUT == "margin":
    shap_interaction_values = shap_explainer.shap_interaction_values(df[["repair"]+cols])
    shap.summary_plot(shap_interaction_values, df[["repair"]+cols], max_display=4)
In [27]:
# j = np.random.randint(N_SAMPLES)
In [34]:
i = np.random.choice(df[df["pred"] <= 0.5].index.tolist())
print("negative sample")
player_explainer = pd.DataFrame()
player_explainer['feature'] = ["repair"]+cols
player_explainer['feature_value'] = df[["repair"]+cols].iloc[i].values
player_explainer['shap_value'] = shap_values[i]
player_explainer
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[i]))
negative sample
Out[34]:
feature feature_value shap_value
0 repair 0.000000 0.000000
1 lat 34.274354 0.003108
2 lng 109.059910 -0.023467
3 doubleFlow_user_view_3 4.000000 0.008655
4 doubleFlow_weng_user_view_3 4.000000 -0.000028
5 doubleFlow_travel_user_view_3 0.000000 0.000070
6 doubleFlow_user_click_3 1.000000 -0.001437
7 doubleFlow_poi_norm_click_3 0.000000 0.000508
8 doubleFlow_weng_user_click_3 1.000000 -0.000192
9 doubleFlow_travel_user_click_3 0.000000 -0.000125
10 doubleFlow_user_view_1 0.000000 0.002851
11 doubleFlow_weng_user_view_1 0.000000 -0.003075
12 doubleFlow_travel_user_view_1 0.000000 0.000174
13 doubleFlow_user_click_1 0.000000 -0.000960
14 doubleFlow_poi_norm_click_1 0.000000 -0.002248
15 doubleFlow_weng_user_click_1 0.000000 -0.001976
16 doubleFlow_travel_user_click_1 0.000000 -0.000047
17 doubleFlow_user_view_7 4.000000 0.018156
18 doubleFlow_weng_user_view_7 4.000000 0.005504
19 doubleFlow_travel_user_view_7 0.000000 0.007487
20 doubleFlow_user_click_7 1.000000 -0.005037
21 doubleFlow_poi_norm_click_7 0.000000 -0.000070
22 doubleFlow_weng_user_click_7 1.000000 -0.000387
23 doubleFlow_travel_user_click_7 0.000000 -0.000273
24 doubleFlow_user_view_30 4.000000 0.028969
25 doubleFlow_weng_user_view_30 4.000000 0.037439
26 doubleFlow_travel_user_view_30 0.000000 0.001835
27 doubleFlow_user_click_30 1.000000 -0.023074
28 doubleFlow_poi_norm_click_30 0.000000 0.002874
29 doubleFlow_weng_user_click_30 1.000000 -0.004629
... ... ... ...
88 event_hour=6 0.000000 0.000000
89 event_hour=7 0.000000 0.000000
90 event_hour=8 0.000000 0.000000
91 event_hour=9 0.000000 0.000000
92 event_hour=10 0.000000 0.000000
93 event_hour=11 0.000000 0.000000
94 event_hour=12 0.000000 0.000000
95 event_hour=13 0.000000 0.000000
96 event_hour=14 0.000000 0.000000
97 event_hour=15 0.000000 0.000000
98 event_hour=16 0.000000 0.000000
99 event_hour=17 0.000000 0.000000
100 event_hour=18 0.000000 0.000000
101 event_hour=19 0.000000 0.000000
102 event_hour=20 1.000000 0.000000
103 event_hour=21 0.000000 0.000000
104 event_hour=22 0.000000 0.000000
105 event_hour=23 0.000000 0.000000
106 event_month=1 0.000000 0.000000
107 event_month=2 0.000000 0.000000
108 event_month=3 0.000000 0.000000
109 event_month=4 0.000000 0.000000
110 event_month=5 0.000000 0.000000
111 event_month=6 0.000000 0.000000
112 event_month=7 1.000000 0.000000
113 event_month=8 0.000000 0.000000
114 event_month=9 0.000000 0.000000
115 event_month=10 0.000000 0.000000
116 event_month=11 0.000000 0.000000
117 event_month=12 0.000000 0.000000

118 rows × 3 columns

y_base + sum_of_shap_values: 0.31
y_pred: 0.30
In [29]:
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[i], df[["repair"]+cols].iloc[i])
Out[29]:
(interactive SHAP force plot; JavaScript output not captured in this export)
In [35]:
j = np.random.choice(df[df["pred"] >= 0.5].index.tolist())
print("positive sample")
player_explainer = pd.DataFrame()
player_explainer['feature'] = ["repair"]+cols
player_explainer['feature_value'] = df[["repair"]+cols].iloc[j].values
player_explainer['shap_value'] = shap_values[j]
player_explainer
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[j]))
positive sample
Out[35]:
feature feature_value shap_value
0 repair 0.0 0.000000
1 lat 0.0 0.038460
2 lng 0.0 0.054586
3 doubleFlow_user_view_3 0.0 0.008936
4 doubleFlow_weng_user_view_3 0.0 0.005120
5 doubleFlow_travel_user_view_3 0.0 0.004532
6 doubleFlow_user_click_3 0.0 -0.000929
7 doubleFlow_poi_norm_click_3 0.0 0.000170
8 doubleFlow_weng_user_click_3 0.0 0.001268
9 doubleFlow_travel_user_click_3 0.0 -0.000166
10 doubleFlow_user_view_1 0.0 0.002258
11 doubleFlow_weng_user_view_1 0.0 0.000652
12 doubleFlow_travel_user_view_1 0.0 -0.001401
13 doubleFlow_user_click_1 0.0 -0.000754
14 doubleFlow_poi_norm_click_1 0.0 -0.007516
15 doubleFlow_weng_user_click_1 0.0 -0.002728
16 doubleFlow_travel_user_click_1 0.0 0.000163
17 doubleFlow_user_view_7 0.0 0.049772
18 doubleFlow_weng_user_view_7 0.0 0.010798
19 doubleFlow_travel_user_view_7 0.0 0.014413
20 doubleFlow_user_click_7 0.0 -0.007900
21 doubleFlow_poi_norm_click_7 0.0 -0.000524
22 doubleFlow_weng_user_click_7 0.0 -0.005436
23 doubleFlow_travel_user_click_7 0.0 0.001110
24 doubleFlow_user_view_30 0.0 0.033920
25 doubleFlow_weng_user_view_30 0.0 0.044260
26 doubleFlow_travel_user_view_30 0.0 0.006287
27 doubleFlow_user_click_30 0.0 -0.029860
28 doubleFlow_poi_norm_click_30 0.0 0.004882
29 doubleFlow_weng_user_click_30 0.0 -0.010792
... ... ... ...
88 event_hour=6 0.0 0.000000
89 event_hour=7 0.0 0.000000
90 event_hour=8 0.0 0.000000
91 event_hour=9 0.0 0.000000
92 event_hour=10 0.0 0.000000
93 event_hour=11 0.0 0.000000
94 event_hour=12 0.0 0.000000
95 event_hour=13 0.0 0.000000
96 event_hour=14 0.0 0.000000
97 event_hour=15 0.0 0.000000
98 event_hour=16 0.0 0.000000
99 event_hour=17 0.0 0.000000
100 event_hour=18 0.0 0.000000
101 event_hour=19 0.0 0.000000
102 event_hour=20 0.0 0.000000
103 event_hour=21 0.0 0.000000
104 event_hour=22 1.0 0.000000
105 event_hour=23 0.0 0.000000
106 event_month=1 0.0 0.000000
107 event_month=2 0.0 0.000000
108 event_month=3 0.0 0.000000
109 event_month=4 0.0 0.000000
110 event_month=5 0.0 0.000000
111 event_month=6 0.0 0.000000
112 event_month=7 1.0 0.000000
113 event_month=8 0.0 0.000000
114 event_month=9 0.0 0.000000
115 event_month=10 0.0 0.000000
116 event_month=11 0.0 0.000000
117 event_month=12 0.0 0.000000

118 rows × 3 columns

y_base + sum_of_shap_values: 0.78
y_pred: 0.76
In [36]:
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[j], df[["repair"]+cols].iloc[j])
Out[36]:
(interactive SHAP force plot; JavaScript output not captured in this export)
In [38]:
FEATURE = "doubleFlow_article_ctr_30_v1"
INTERACTION = "doubleFlow_user_view_30"
shap.dependence_plot(FEATURE, shap_values, df[["repair"]+cols], interaction_index=None, show=False)
shap.dependence_plot(FEATURE, shap_values, df[["repair"]+cols], interaction_index=INTERACTION, show=False)
In [39]:
# lime
lime_explainer = lime.lime_tabular.LimeTabularExplainer(df[["repair"]+cols].values,
                                                        feature_names=["repair"]+cols,
                                                        class_names=["0", "1"],
                                                        verbose=True)
In [40]:
# clear any stored feature names so the booster accepts the unnamed numpy
# arrays LIME passes in, then wrap the binary model so it returns the
# two-class probability array LIME expects
model.feature_names = None
def predict_fn(x):
    preds = model.predict(xgb.DMatrix(x))
    return np.array([[1 - p, p] for p in preds])
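LIME expects the prediction function to return an (n_samples, n_classes) array of probabilities; a quick shape check (a sketch added for illustration):

# sanity check (sketch): each row should be [P(class 0), P(class 1)] summing to 1
probs = predict_fn(df[["repair"]+cols].values[:3])
print(probs.shape)
print(probs.sum(axis=1))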
In [41]:
i = np.random.choice(df[df["pred"] <= 0.5].index.tolist())
print("negative sample")
player_explainer = pd.DataFrame()
player_explainer['feature'] = ["repair"]+cols
player_explainer['feature_value'] = df[["repair"]+cols].iloc[i].values
player_explainer['shap_value'] = shap_values[i]
player_explainer
negative sample
Out[41]:
feature feature_value shap_value
0 repair 0.000000 0.000000
1 lat 41.661230 -0.004250
2 lng 123.089761 -0.005605
3 doubleFlow_user_view_3 0.000000 0.009743
4 doubleFlow_weng_user_view_3 0.000000 -0.000286
5 doubleFlow_travel_user_view_3 0.000000 0.001608
6 doubleFlow_user_click_3 0.000000 -0.001793
7 doubleFlow_poi_norm_click_3 0.000000 0.000063
8 doubleFlow_weng_user_click_3 0.000000 0.000682
9 doubleFlow_travel_user_click_3 0.000000 -0.000172
10 doubleFlow_user_view_1 0.000000 0.001553
11 doubleFlow_weng_user_view_1 0.000000 0.001871
12 doubleFlow_travel_user_view_1 0.000000 -0.001295
13 doubleFlow_user_click_1 0.000000 -0.000627
14 doubleFlow_poi_norm_click_1 0.000000 -0.001634
15 doubleFlow_weng_user_click_1 0.000000 -0.000882
16 doubleFlow_travel_user_click_1 0.000000 -0.000099
17 doubleFlow_user_view_7 0.000000 0.033112
18 doubleFlow_weng_user_view_7 0.000000 0.006703
19 doubleFlow_travel_user_view_7 0.000000 0.010458
20 doubleFlow_user_click_7 0.000000 -0.002556
21 doubleFlow_poi_norm_click_7 0.000000 -0.000220
22 doubleFlow_weng_user_click_7 0.000000 -0.006797
23 doubleFlow_travel_user_click_7 0.000000 0.000679
24 doubleFlow_user_view_30 0.000000 0.024333
25 doubleFlow_weng_user_view_30 0.000000 0.030723
26 doubleFlow_travel_user_view_30 0.000000 0.006206
27 doubleFlow_user_click_30 0.000000 -0.032430
28 doubleFlow_poi_norm_click_30 0.000000 0.003362
29 doubleFlow_weng_user_click_30 0.000000 -0.008083
... ... ... ...
88 event_hour=6 0.000000 0.000000
89 event_hour=7 0.000000 0.000000
90 event_hour=8 0.000000 0.000000
91 event_hour=9 0.000000 0.000000
92 event_hour=10 0.000000 0.000000
93 event_hour=11 0.000000 0.000000
94 event_hour=12 0.000000 0.000000
95 event_hour=13 0.000000 0.000000
96 event_hour=14 0.000000 0.000000
97 event_hour=15 0.000000 0.000000
98 event_hour=16 0.000000 0.000000
99 event_hour=17 0.000000 0.000000
100 event_hour=18 0.000000 0.000000
101 event_hour=19 1.000000 0.000000
102 event_hour=20 0.000000 0.000000
103 event_hour=21 0.000000 0.000000
104 event_hour=22 0.000000 0.000000
105 event_hour=23 0.000000 0.000000
106 event_month=1 0.000000 0.000000
107 event_month=2 0.000000 0.000000
108 event_month=3 0.000000 0.000000
109 event_month=4 0.000000 0.000000
110 event_month=5 0.000000 0.000000
111 event_month=6 0.000000 0.000000
112 event_month=7 1.000000 0.000000
113 event_month=8 0.000000 0.000000
114 event_month=9 0.000000 0.000000
115 event_month=10 0.000000 0.000000
116 event_month=11 0.000000 0.000000
117 event_month=12 0.000000 0.000000

118 rows × 3 columns

In [43]:
exp = lime_explainer.explain_instance(df[["repair"]+cols].values[i], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)
Intercept 0.29527383528420614
Prediction_local [0.3280011]
Right: 0.2845465838909149
In [44]:
exp.as_list()
fig = exp.as_pyplot_figure()
fig.show()
Out[44]:
[('doubleFlow_user_click_30 <= 0.00', -0.11514450358511392), ('doubleFlow_user_view_7 <= 0.00', 0.08986872155226684), ('doubleFlow_article_ctr_30_v1 <= 0.03', -0.07945548816299527), ('doubleFlow_user_view_30 <= 0.00', 0.07438843163639652), ('doubleFlow_weng_user_view_30 <= 0.00', 0.06307010353457698)]
In [46]:
j = np.random.choice(df[df["pred"] >= 0.5].index.tolist())
print("positive sample")
player_explainer = pd.DataFrame()
player_explainer['feature'] = ["repair"]+cols
player_explainer['feature_value'] = df[["repair"]+cols].iloc[j].values
player_explainer['shap_value'] = shap_values[j]
player_explainer
positive sample
Out[46]:
feature feature_value shap_value
0 repair 0.0 0.000000
1 lat 0.0 0.038460
2 lng 0.0 0.054586
3 doubleFlow_user_view_3 0.0 0.008936
4 doubleFlow_weng_user_view_3 0.0 0.005120
5 doubleFlow_travel_user_view_3 0.0 0.004532
6 doubleFlow_user_click_3 0.0 -0.000929
7 doubleFlow_poi_norm_click_3 0.0 0.000170
8 doubleFlow_weng_user_click_3 0.0 0.001268
9 doubleFlow_travel_user_click_3 0.0 -0.000166
10 doubleFlow_user_view_1 0.0 0.002258
11 doubleFlow_weng_user_view_1 0.0 0.000652
12 doubleFlow_travel_user_view_1 0.0 -0.001401
13 doubleFlow_user_click_1 0.0 -0.000754
14 doubleFlow_poi_norm_click_1 0.0 -0.007516
15 doubleFlow_weng_user_click_1 0.0 -0.002728
16 doubleFlow_travel_user_click_1 0.0 0.000163
17 doubleFlow_user_view_7 0.0 0.049772
18 doubleFlow_weng_user_view_7 0.0 0.010798
19 doubleFlow_travel_user_view_7 0.0 0.014413
20 doubleFlow_user_click_7 0.0 -0.007900
21 doubleFlow_poi_norm_click_7 0.0 -0.000524
22 doubleFlow_weng_user_click_7 0.0 -0.005436
23 doubleFlow_travel_user_click_7 0.0 0.001110
24 doubleFlow_user_view_30 0.0 0.033920
25 doubleFlow_weng_user_view_30 0.0 0.044260
26 doubleFlow_travel_user_view_30 0.0 0.006287
27 doubleFlow_user_click_30 0.0 -0.029860
28 doubleFlow_poi_norm_click_30 0.0 0.004882
29 doubleFlow_weng_user_click_30 0.0 -0.010792
... ... ... ...
88 event_hour=6 0.0 0.000000
89 event_hour=7 0.0 0.000000
90 event_hour=8 0.0 0.000000
91 event_hour=9 0.0 0.000000
92 event_hour=10 0.0 0.000000
93 event_hour=11 0.0 0.000000
94 event_hour=12 0.0 0.000000
95 event_hour=13 0.0 0.000000
96 event_hour=14 0.0 0.000000
97 event_hour=15 0.0 0.000000
98 event_hour=16 0.0 0.000000
99 event_hour=17 0.0 0.000000
100 event_hour=18 0.0 0.000000
101 event_hour=19 0.0 0.000000
102 event_hour=20 0.0 0.000000
103 event_hour=21 0.0 0.000000
104 event_hour=22 1.0 0.000000
105 event_hour=23 0.0 0.000000
106 event_month=1 0.0 0.000000
107 event_month=2 0.0 0.000000
108 event_month=3 0.0 0.000000
109 event_month=4 0.0 0.000000
110 event_month=5 0.0 0.000000
111 event_month=6 0.0 0.000000
112 event_month=7 1.0 0.000000
113 event_month=8 0.0 0.000000
114 event_month=9 0.0 0.000000
115 event_month=10 0.0 0.000000
116 event_month=11 0.0 0.000000
117 event_month=12 0.0 0.000000

118 rows × 3 columns

In [47]:
exp = lime_explainer.explain_instance(df[["repair"]+cols].values[j], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)
Intercept 0.24858552903447084
Prediction_local [0.44815131]
Right: 0.7607649564743042
In [48]:
exp.as_list()
fig = exp.as_pyplot_figure()
fig.show()
Out[48]:
[('doubleFlow_user_click_30 <= 0.00', -0.11560679828953539), ('doubleFlow_user_view_7 <= 0.00', 0.10199723108604003), ('doubleFlow_article_ctr_30_v1 > 0.10', 0.07399143086476795), ('doubleFlow_weng_user_view_30 <= 0.00', 0.07209530902841806), ('doubleFlow_user_view_30 <= 0.00', 0.06708860392451839)]
In [ ]: